import numpy as np
import pandas as pd
from itertools import cycle
from scipy import interp
# sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
# TensorFlow
import tensorflow as tf
# Timer
from timeit import default_timer as timer
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
from matplotlib import cm
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this article, we demonstrate how to solve a classification problem with TensorFlow Estimators, using the Heart Disease Dataset from the UCI Machine Learning Repository.

Picture Source: harvard.edu
The goal is to develop a predictive model that can predict whether heart disease is present or absent based on the rest of the given features.
# Column names for the space-delimited data file, in file order.
Attributes = [
    'Age', 'Sex', 'Chest Pain Type', 'Resting Blood Pressure', 'Serum Cholestoral',
    'Fasting Blood Sugar', 'Resting Electrocardiographic Results',
    'Maximum Heart Rate Achieved', 'Exercise Induced Angina', 'Oldpeak', 'Slope',
    'Number of Major Vessels', 'Thal', 'Heart Disease',
]
Data = pd.DataFrame(
    data=np.genfromtxt('heart-disease/heart.dat', delimiter=' '),
    columns=Attributes,
)

# Coded features are stored as strings (float -> int -> str) so that they end
# up with dtype object instead of a numeric dtype.
categorical_cols = [
    'Sex', 'Chest Pain Type', 'Fasting Blood Sugar',
    'Resting Electrocardiographic Results', 'Exercise Induced Angina', 'Slope',
    'Number of Major Vessels', 'Thal',
]
for col in categorical_cols:
    Data[col] = Data[col].astype(int).astype(str)
del categorical_cols, col

# Target column: shifted down by one and stored as int; Labels_dict gives the
# display name of each resulting code.
Target = 'Heart Disease'
Labels_dict = {0: 'Absent', 1: 'Present'}
Data[Target] = (Data[Target] - 1).astype(int)

# Preview of the first rows and the overall size of the dataset.
display(Data.head(5))
display(
    pd.DataFrame({
        'Number of Instances': [Data.shape[0]],
        'Number of Attributes': [Data.shape[1]],
    }).style.hide_index()
)

# Maps: human-readable names for the coded values of some features.
Maps = {
    'Sex': {'0': 'Female', '1': 'Male'},
    'Chest Pain Type': {
        '1': 'Typical Angina',
        '2': 'Atypical Angina',
        '3': 'Non-Anginal Pain',
        '4': 'Asymptomatic',
    },
    'Fasting Blood Sugar': {'0': 'False', '1': 'True'},
    'Exercise Induced Angina': {'0': 'No', '1': 'Yes'},
    'Slope': {'1': 'Upsloping', '2': 'Flat', '3': 'Downsloping'},
    'Thal': {'3': 'Normal', '6': 'Fixed Defect', '7': 'Reversable Defect'},
}
| Age | Sex | Chest Pain Type | Resting Blood Pressure | Serum Cholestoral | Fasting Blood Sugar | Resting Electrocardiographic Results | Maximum Heart Rate Achieved | Exercise Induced Angina | Oldpeak | Slope | Number of Major Vessels | Thal | Heart Disease | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 70.0 | 1 | 4 | 130.0 | 322.0 | 0 | 2 | 109.0 | 0 | 2.4 | 2 | 3 | 3 | 1 |
| 1 | 67.0 | 0 | 3 | 115.0 | 564.0 | 0 | 2 | 160.0 | 0 | 1.6 | 2 | 0 | 7 | 0 |
| 2 | 57.0 | 1 | 2 | 124.0 | 261.0 | 0 | 0 | 141.0 | 0 | 0.3 | 1 | 0 | 7 | 1 |
| 3 | 64.0 | 1 | 4 | 128.0 | 263.0 | 0 | 0 | 105.0 | 1 | 0.2 | 2 | 1 | 7 | 0 |
| 4 | 74.0 | 0 | 2 | 120.0 | 269.0 | 0 | 2 | 121.0 | 1 | 0.2 | 1 | 1 | 3 | 0 |
| Number of Instances | Number of Attributes |
|---|---|
| 270 | 14 |
def Data_Plot(Inp, Title = None, W = None):
    """Show a bar chart of per-column completeness and return the summary table.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame to summarise.
    Title : str, optional
        Chart title (rendered in bold). No title is set when None.
    W : int, optional
        Figure width passed to the plotly layout. Left at the plotly default
        when None.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``Inp`` with the columns 'Features',
        'Data Type', 'Number of NaN Values', 'Size' (row count of ``Inp``) and
        'Percentage' (100 minus the rounded percentage of NaN values).
    """
    data_info = Inp.dtypes.astype(str).to_frame(name='Data Type')
    nan_counts = Inp.isnull().sum().to_frame(name='Number of NaN Values')
    data_info = data_info.join(nan_counts, how='outer')
    data_info['Size'] = Inp.shape[0]
    data_info['Percentage'] = 100 - np.round(
        100 * (data_info['Number of NaN Values'] / Inp.shape[0]), 2)
    data_info = data_info.reset_index(drop=False).rename(columns={'index': 'Features'})

    fig = px.bar(data_info, x='Features', y='Percentage', color='Data Type',
                 text='Percentage',
                 color_discrete_sequence=['PaleGreen', 'LightCyan', 'PeachPuff', 'Pink', 'Plum'],
                 hover_data=data_info.columns)
    fig.update_layout(plot_bgcolor='white',
                      legend=dict(x=1.01, y=.5, traceorder="normal",
                                  bordercolor="DarkGray", borderwidth=1))
    # Identity checks against None instead of `== None`.
    if W is not None:
        fig.update_layout(width=W)
    fig.update_traces(texttemplate=10*' ' + '%%{text}', textposition='inside')
    fig.update_traces(marker_line_color='Black', marker_line_width=1., opacity=1)
    if Title is not None:
        # The bold tag is closed properly (the original emitted '<b>' twice).
        fig.update_layout(title={'text': '<b>' + Title + '</b>', 'x': 0.5,
                                 'y': 0.90, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
    return data_info
data_info = Data_Plot(Data, Title='Heart Disease Dataset', W=800)

# For TF: work on a renamed copy whose column names use underscores instead of
# spaces; the original Data frame keeps its names.
tf_frame = Data.rename(columns=lambda name: name.replace(' ', '_'))
target_column = Target.replace(' ', '_')

# Features are everything except the target; labels are a plain array.
y = tf_frame[target_column].values
X = tf_frame.drop(columns=target_column)
del tf_frame, target_column
def DatasetTargetDist(Inp, Target, Labels_dict, PD):
    """Draw a count/percentage table next to a pie chart for one target column.

    ``Inp`` is the data frame, ``Target`` the name of the column to summarise,
    ``Labels_dict`` maps the stored codes to display names, and ``PD`` is a
    dict of plot settings read by key: 'column_widths', 'pull', 'textfont',
    'PieColors', 'hole', 'height', 'legend_title', 'TableColors',
    'tablecolumnwidth', 'title_x' and 'title_y'. The figure is shown; nothing
    is returned.
    """
    # Summary table: one row per class with its count and share of the total.
    counts = Inp[Target].value_counts().to_frame('Count')
    Table = counts.reset_index(drop=False).rename(columns={'index': Target})
    Table[Target] = Table[Target].replace(Labels_dict)
    Table['Percentage'] = np.round(100 * (Table['Count'] / Table['Count'].sum()), 2)

    fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.02,
                        column_widths=PD['column_widths'],
                        specs=[[{"type": "table"}, {"type": "pie"}]])

    # Right-hand side: pie chart. The hole is applied while the pie is the
    # only trace in the figure.
    pie = go.Pie(labels=Table[Target].values, values=Table['Count'].values,
                 pull=PD['pull'], textfont=dict(size=PD['textfont']),
                 marker=dict(colors=PD['PieColors'], line=dict(color='black', width=1)))
    fig.add_trace(pie, row=1, col=2)
    fig.update_traces(hole=PD['hole'])
    fig.update_layout(height=PD['height'], legend=dict(orientation="v"),
                      legend_title_text=PD['legend_title'])

    # Left-hand side: the same table with the percentage formatted as text.
    shown = Table.copy()
    shown['Percentage'] = shown['Percentage'].map(lambda x: '%%%.2f' % x)
    cell_values = [shown.loc[:, name].values for name in shown.columns]
    header_fill, cell_fill = PD['TableColors'][0], PD['TableColors'][1]
    table = go.Table(
        header=dict(values=list(Table.columns), line_color='darkslategray',
                    fill_color=header_fill, align=['center', 'center'],
                    font=dict(color='white', size=12), height=25),
        columnwidth=PD['tablecolumnwidth'],
        cells=dict(values=cell_values, line_color='darkslategray',
                   fill=dict(color=[cell_fill, cell_fill]),
                   align=['center', 'center'], font_size=12, height=20))
    fig.add_trace(table, 1, 1)

    fig.update_layout(title={'text': '<b>' + Target + '<b>', 'x': PD['title_x'],
                             'y': PD['title_y'], 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
# Plot settings for the target distribution figure. Only the last pie slice
# is pulled out (by 0.05); all others stay at 0.
PD = {
    'PieColors': ['SeaGreen', 'FireBrick'],
    'TableColors': ['Navy', 'White'],
    'hole': .4,
    'column_widths': [0.6, 0.4],
    'textfont': 14,
    'height': 350,
    'tablecolumnwidth': [0.20, 0.12, 0.15],
    'pull': [0] * (len(Labels_dict) - 1) + [.05],
    'legend_title': Target,
    'title_x': 0.5,
    'title_y': 0.8,
}
DatasetTargetDist(Data, Target, Labels_dict, PD)
StratifiedShuffleSplit is a variation of ShuffleSplit which returns stratified splits: each set contains approximately the same percentage of samples of each target class as the complete set.
# Single stratified train/test split with a fixed seed.
Test_Size = 0.3
sss = StratifiedShuffleSplit(n_splits=1, test_size=Test_Size, random_state=42)
# For Tensorflow
X.columns = [x.replace(' ', '_') for x in X.columns]

# n_splits=1, so the splitter yields exactly one (train, test) pair.
train_index, test_index = next(sss.split(X, y))

# split() returns positional indices, so pandas objects are sliced with
# .iloc. The original used .loc (label-based), which only matches when the
# index happens to be the default 0..n-1 range.
if isinstance(X, pd.DataFrame):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
else:
    X_train, X_test = X[train_index], X[test_index]

if isinstance(y, pd.Series):
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
else:
    y_train, y_test = y[train_index], y[test_index]
del sss
def Train_Test_Dist(X_train, y_train, X_test, y_test, PD, Labels_dict = Labels_dict):
    """Show the shapes of the train/test sets and the class mix of each.

    The figure has three columns: a table with the shape of every set, a pie
    chart of the class distribution in ``y_train`` and one for ``y_test``.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        The split data; only ``.shape`` is read from the feature sets, and the
        label sets may be arrays or pandas Series.
    PD : dict
        Plot settings read by key: 'column_widths', 'pull', 'textfont',
        'PieColors', 'legend_title', 'TableColors', 'tablecolumnwidth',
        'title_x', 'title_y' and 'height' (may be None).
    Labels_dict : dict
        Maps stored label codes to display names; its value order fixes the
        order of the pie slices.
    """
    class_names = list(Labels_dict.values())
    fig = make_subplots(rows=1, cols=3, horizontal_spacing=0.02,
                        column_widths=PD['column_widths'],
                        specs=[[{"type": "table"}, {'type': 'domain'}, {'type': 'domain'}]])

    # Right: one pie per label set, in subplot columns 2 and 3.
    for col, target in enumerate((y_train, y_test), start=2):
        # value_counts() orders by frequency, so the counts are re-aligned to
        # class_names; otherwise a slice could carry another class's label.
        counts = (pd.Series(target).replace(Labels_dict).value_counts()
                  .reindex(class_names, fill_value=0))
        fig.add_trace(go.Pie(labels=class_names, values=counts.values,
                             pull=PD['pull'], textfont=dict(size=PD['textfont']),
                             marker=dict(colors=PD['PieColors'],
                                         line=dict(color='black', width=1))),
                      row=1, col=col)
    # Applied while the figure only holds the pies (the table is added below).
    fig.update_traces(hole=.5)
    fig.update_layout(legend=dict(orientation="v"), legend_title_text=PD['legend_title'])

    # Left: table with the shape of each set.
    Table = pd.DataFrame(data={'Set': ['X_train', 'X_test', 'y_train', 'y_test'],
                               'Shape': [X_train.shape, X_test.shape,
                                         y_train.shape, y_test.shape]}).astype(str)
    cell_values = [Table.loc[:, name].values for name in Table.columns]
    TableColors = PD['TableColors']
    fig.add_trace(go.Table(header=dict(values=list(Table.columns), line_color='darkslategray',
                                       fill_color=TableColors[0], align=['center', 'center'],
                                       font=dict(color='white', size=12), height=25),
                           columnwidth=PD['tablecolumnwidth'],
                           cells=dict(values=cell_values, line_color='darkslategray',
                                      fill=dict(color=[TableColors[1], TableColors[1]]),
                                      align=['center', 'center'], font_size=12, height=20)),
                  1, 1)
    # The bold tag is closed properly (the original emitted '<b>' twice).
    fig.update_layout(title={'text': '<b>' + 'Dataset Distribution' + '</b>', 'x': PD['title_x'],
                             'y': PD['title_y'], 'xanchor': 'center', 'yanchor': 'top'})
    if PD['height'] is not None:
        fig.update_layout(height=PD['height'])
    fig.show()
# Reuse the earlier plot settings, overriding the layout entries for the
# three-column train/test figure.
PD.update(
    column_widths=[0.3, 0.3, 0.3],
    tablecolumnwidth=[0.2, 0.4],
    height=350,
    legend_title=Target,
)
Train_Test_Dist(X_train, y_train, X_test, y_test, PD)
The input function specifies how data is converted to a tf.data.Dataset that feeds the input pipeline in a streaming fashion. It returns a tf.data.Dataset object that yields two-element tuples of the form (features, labels): features is a dictionary mapping each feature name to that feature's values, and labels holds the label value of every example.
def input_fn(features, labels, training=True, batch_size=256):
    """Build the batched tf.data.Dataset used for training or evaluation.

    The dataset is sliced from ``(dict(features), labels)``. In training mode
    it is shuffled (buffer of 1000) and repeated before batching; otherwise it
    is only batched.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    if not training:
        return dataset.batch(batch_size)
    return dataset.shuffle(1000).repeat().batch(batch_size)
Moreover, an estimator model consists of two main parts: feature columns and a numeric vector. Feature columns describe how the input numeric vector should be interpreted. The following function separates the categorical and numerical columns (features) and returns a descriptive list of feature columns.
def Feat_Columns(Inp):
    """Return TensorFlow feature columns for every usable column of ``Inp``.

    Columns of dtype 'object' become categorical vocabulary-list columns
    (vocabulary = the column's unique values); columns of dtype int64, int32,
    float64 or float32 become numeric columns. Categorical columns come first
    in the returned list, followed by the numeric ones; columns of any other
    dtype are left out.
    """
    dtype_names = Inp.dtypes.astype(str)
    numeric_kinds = ['int64', 'int32', 'float64', 'float32']
    categorical_names = dtype_names[dtype_names == 'object'].index.tolist()
    numeric_names = dtype_names[dtype_names.isin(numeric_kinds)].index.tolist()

    feature_columns = [
        tf.feature_column.categorical_column_with_vocabulary_list(name, Inp[name].unique())
        for name in categorical_names
    ]
    feature_columns.extend(
        tf.feature_column.numeric_column(name) for name in numeric_names
    )
    return feature_columns
# Feature columns are derived once from the full feature frame and shared by
# every classifier that follows.
my_feature_columns = Feat_Columns(X)

tf.keras.backend.clear_session()
IT = 5000  # upper bound on the number of training steps

# Boosted-trees classifier without explicit regularisation settings.
classifier = tf.estimator.BoostedTreesClassifier(
    feature_columns=my_feature_columns,
    n_batches_per_layer=1,
    n_classes=len(Labels_dict),
    learning_rate=0.1,
)

# Fit on the training split, then score on the held-out split.
classifier.train(
    input_fn=lambda: input_fn(X_train, y_train, training=True),
    max_steps=IT,
)
result = classifier.evaluate(
    input_fn=lambda: input_fn(X_test, y_test, training=False),
)

# Drop the training log output and show the evaluation metrics as one row.
clear_output()
display(pd.DataFrame(result, index=['']).round(4))
| accuracy | accuracy_baseline | auc | auc_precision_recall | average_loss | label/mean | loss | precision | prediction/mean | recall | global_step | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.8272 | 0.5556 | 0.8583 | 0.8505 | 1.5855 | 0.4444 | 1.5855 | 0.8056 | 0.44 | 0.8056 | 600 |
def ROC_Curve(y_test, probs, n_classes, FS = 7, ax = False, pad = 0.01):
    """Plot per-class, micro-average and macro-average ROC curves with plotly.

    Parameters
    ----------
    y_test : array-like
        Integer class labels; converted to one-hot form with
        ``tf.keras.utils.to_categorical``.
    probs : numpy.ndarray
        Predicted scores, indexed as ``probs[:, i]`` for class ``i``.
    n_classes : int
        Number of classes (columns of ``probs``).
    FS, ax
        Accepted for backward compatibility; not used by this function.
    pad : float
        Margin added on both sides of the [0, 1] axis ranges.

    The figure is shown; nothing is returned.
    """
    # One-hot encode the labels so each class can be scored one-vs-rest.
    y_test_cat = tf.keras.utils.to_categorical(y_test, num_classes=n_classes, dtype='float32')

    # ROC curve and area for each class.
    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = metrics.roc_curve(y_test_cat[:, i], probs[:, i])
        roc_auc[i] = metrics.auc(fpr[i], tpr[i])

    # Micro-average: pool every (label, score) pair across classes.
    fpr["micro"], tpr["micro"], _ = metrics.roc_curve(y_test_cat.ravel(), probs.ravel())
    roc_auc["micro"] = metrics.auc(fpr["micro"], tpr["micro"])

    # Macro-average: interpolate each class curve on the union of all FPR
    # points and average them. np.interp replaces the deprecated scipy.interp
    # alias, which was the same function.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = metrics.auc(fpr["macro"], tpr["macro"])

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], name='FPR = TPR',
                             line=dict(color='Black', width=2, dash='dash')))
    fig.add_trace(go.Scatter(x=fpr["micro"], y=tpr["micro"], mode='lines', marker_color='deeppink',
                             name='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"])))
    fig.add_trace(go.Scatter(x=fpr["macro"], y=tpr["macro"], mode='lines', marker_color='navy',
                             name='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"])))

    # Each class curve gets the next colour of the cycle. The original passed
    # the whole Rainbow palette list here and never used the cycled colour.
    colors = cycle(['Aqua', 'DarkOrange', 'CornflowerBlue'])
    for i, color in zip(range(n_classes), colors):
        fig.add_trace(go.Scatter(x=fpr[i], y=tpr[i], mode='lines', marker_color=color,
                                 name='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])))

    # Background, axes and sizing.
    fig.update_layout(plot_bgcolor='white')
    axis_style = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                      zeroline=True, zerolinewidth=1, zerolinecolor='Black',
                      showgrid=True, gridwidth=1, gridcolor='Lightgray',
                      range=[-pad, 1 + pad])
    fig.update_xaxes(title='False Positive Rate (FPR)', **axis_style)
    fig.update_yaxes(title='True Positive Rate (TPR)', **axis_style)
    # Same scale on both axes so the diagonal is drawn at 45 degrees.
    fig.update_yaxes(scaleanchor="x", scaleratio=1)
    fig.update_layout(height=600, width=810)
    # The bold tag is closed properly (the original emitted '<b>' twice).
    fig.update_layout(title={'text': '<b>' + 'Receiver Operating Characteristic (ROC) Curves' + '</b>',
                             'x': .5, 'y': .9, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
# Class probabilities for the test split, one row per example.
pred_dicts = list(
    classifier.predict(input_fn=lambda: input_fn(X_test, y_test, training=False))
)
clear_output()
probs = np.array([entry['probabilities'] for entry in pred_dicts])

ROC_Curve(y_test, probs, n_classes=len(Labels_dict), FS=8)
The confusion matrix allows for visualization of the performance of an algorithm. Note that due to the size of data, here we don't provide a Cross-validation evaluation. In general, this type of evaluation is preferred.
def Confusion_Mat(CM_Train, CM_Test, PD, n_splits = 10):
    """Draw raw and row-normalised confusion matrices for train and test sets.

    One matplotlib figure is created per set; each has the raw counts on the
    left and the row-normalised matrix (rounded to 2 decimals) on the right.

    Parameters
    ----------
    CM_Train, CM_Test : array-like
        Square confusion matrices (rows = true labels, columns = predicted).
    PD : dict
        Plot settings read by key: 'FS' (figure size), 'annot_kws' (annotation
        font size), 'shrink' (colour-bar shrink factor) and 'Labels' (tick
        labels for both axes).
    n_splits : int or None
        When not None, the value is appended to the figure titles as
        "(CV = n)"; with None the titles are plain.
    """
    if n_splits is None:
        Titles = ['Train Set', 'Test Set']
    else:
        Titles = ['Train Set (CV = % i)' % n_splits, 'Test Set (CV = % i)' % n_splits]
    # (raw counts colormap, normalised colormap) for the train and test figures.
    palettes = [('Greens', 'YlGn'), ('Blues', 'PuBu')]

    for title, cm, (raw_cmap, norm_cmap) in zip(Titles, [CM_Train, CM_Test], palettes):
        cm = np.asarray(cm)
        fig, ax = plt.subplots(1, 2, figsize=PD['FS'])
        fig.suptitle(title, weight='bold', fontsize=16)

        sns.heatmap(cm, annot=True, annot_kws={"size": PD['annot_kws']}, cmap=raw_cmap,
                    ax=ax[0], linewidths=0.2, cbar_kws={"shrink": PD['shrink']})
        ax[0].set_title('Confusion Matrix')

        # Normalise each row by its total. A row with no samples stays at 0
        # instead of turning into NaN through a division by zero.
        row_totals = cm.sum(axis=1, keepdims=True)
        normalized = np.round(
            np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float),
                      where=row_totals != 0),
            2)
        sns.heatmap(normalized, annot=True, annot_kws={"size": PD['annot_kws']},
                    cmap=norm_cmap, ax=ax[1], linewidths=0.4, vmin=0, vmax=1,
                    cbar_kws={"shrink": PD['shrink']})
        ax[1].set_title('Normalized Confusion Matrix')

        for a in ax:
            a.set_xlabel('Predicted labels')
            a.set_ylabel('True labels')
            a.xaxis.set_ticklabels(PD['Labels'])
            a.yaxis.set_ticklabels(PD['Labels'])
            a.set_aspect(1)
# Train set: predicted class = index of the largest probability.
pred_dicts = list(
    classifier.predict(input_fn=lambda: input_fn(X_train, y_train, training=False))
)
clear_output()
probs = np.array([entry['probabilities'] for entry in pred_dicts])
y_pred = probs.argmax(axis=1).reshape(-1, 1)
CM_Train = metrics.confusion_matrix(y_train, y_pred)
Reports_Train = pd.DataFrame(
    metrics.classification_report(y_train, y_pred,
                                  target_names=list(Labels_dict.values()),
                                  output_dict=True)
).T.reset_index().rename(columns={'index': 'Train Set'})

# Test set: same steps on the held-out split.
pred_dicts = list(
    classifier.predict(input_fn=lambda: input_fn(X_test, y_test, training=False))
)
clear_output()
probs = np.array([entry['probabilities'] for entry in pred_dicts])
y_pred = probs.argmax(axis=1).reshape(-1, 1)
CM_Test = metrics.confusion_matrix(y_test, y_pred)
Reports_Test = pd.DataFrame(
    metrics.classification_report(y_test, y_pred,
                                  target_names=list(Labels_dict.values()),
                                  output_dict=True)
).T.reset_index().rename(columns={'index': 'Test Set'})

# Both reports are displayed only after the last clear_output() call, so
# neither table is wiped.
display(
    Reports_Train.style.hide_index()
    .set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'})
    .set_properties(subset=['Train Set'], **{'background-color': 'SeaGreen', 'color': 'White'})
)
display(
    Reports_Test.style.hide_index()
    .set_properties(**{'background-color': 'Azure', 'color': 'Black'})
    .set_properties(subset=['Test Set'], **{'background-color': 'RoyalBlue', 'color': 'White'})
)

PD = {'FS': (12, 6), 'annot_kws': 14, 'shrink': .6, 'Labels': list(Labels_dict.values())}
Confusion_Mat(CM_Train, CM_Test, PD=PD, n_splits=None)
| Train Set | precision | recall | f1-score | support |
|---|---|---|---|---|
| Absent | 1.000000 | 1.000000 | 1.000000 | 105.000000 |
| Present | 1.000000 | 1.000000 | 1.000000 | 84.000000 |
| accuracy | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| macro avg | 1.000000 | 1.000000 | 1.000000 | 189.000000 |
| weighted avg | 1.000000 | 1.000000 | 1.000000 | 189.000000 |
| Test Set | precision | recall | f1-score | support |
|---|---|---|---|---|
| Absent | 0.844444 | 0.844444 | 0.844444 | 45.000000 |
| Present | 0.805556 | 0.805556 | 0.805556 | 36.000000 |
| accuracy | 0.827160 | 0.827160 | 0.827160 | 0.827160 |
| macro avg | 0.825000 | 0.825000 | 0.825000 | 81.000000 |
| weighted avg | 0.827160 | 0.827160 | 0.827160 | 81.000000 |
The Lasso (least absolute shrinkage and selection operator) was introduced within the context of the method of least squares. Lasso alters the model-fitting process so that only a subset of the provided covariates is used in the final model instead of all of them, which improves the prediction accuracy and interpretability of regression models.
tf.keras.backend.clear_session()
IT = 5000  # upper bound on the number of training steps

# Boosted-trees classifier with an L1 regularisation term.
classifier = tf.estimator.BoostedTreesClassifier(
    feature_columns=my_feature_columns,
    n_batches_per_layer=1,
    n_classes=len(Labels_dict),
    n_trees=120,
    max_depth=5,
    learning_rate=0.1,
    l1_regularization=1e-3,
)

# Fit on the training split, then score on the held-out split.
classifier.train(
    input_fn=lambda: input_fn(X_train, y_train, training=True),
    max_steps=IT,
)
result = classifier.evaluate(
    input_fn=lambda: input_fn(X_test, y_test, training=False),
)

# Drop the training log output and show the evaluation metrics as one row.
clear_output()
display(pd.DataFrame(result, index=['']).round(4))
| accuracy | accuracy_baseline | auc | auc_precision_recall | average_loss | label/mean | loss | precision | prediction/mean | recall | global_step | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.8519 | 0.5556 | 0.8932 | 0.8604 | 0.5694 | 0.4444 | 0.5694 | 0.8 | 0.4993 | 0.8889 | 600 |
# ROC curves from the class probabilities of the test split.
pred_dicts = list(
    classifier.predict(input_fn=lambda: input_fn(X_test, y_test, training=False))
)
clear_output()
probs = np.array([entry['probabilities'] for entry in pred_dicts])
ROC_Curve(y_test, probs, n_classes=len(Labels_dict), FS=8)

# Train set: predicted class = index of the largest probability.
pred_dicts = list(
    classifier.predict(input_fn=lambda: input_fn(X_train, y_train, training=False))
)
clear_output()
probs = np.array([entry['probabilities'] for entry in pred_dicts])
y_pred = probs.argmax(axis=1).reshape(-1, 1)
CM_Train = metrics.confusion_matrix(y_train, y_pred)
Reports_Train = pd.DataFrame(
    metrics.classification_report(y_train, y_pred,
                                  target_names=list(Labels_dict.values()),
                                  output_dict=True)
).T.reset_index().rename(columns={'index': 'Train Set'})

# Test set: same steps on the held-out split.
pred_dicts = list(
    classifier.predict(input_fn=lambda: input_fn(X_test, y_test, training=False))
)
clear_output()
probs = np.array([entry['probabilities'] for entry in pred_dicts])
y_pred = probs.argmax(axis=1).reshape(-1, 1)
CM_Test = metrics.confusion_matrix(y_test, y_pred)
Reports_Test = pd.DataFrame(
    metrics.classification_report(y_test, y_pred,
                                  target_names=list(Labels_dict.values()),
                                  output_dict=True)
).T.reset_index().rename(columns={'index': 'Test Set'})

# Both reports are displayed only after the last clear_output() call, so
# neither table is wiped.
display(
    Reports_Train.style.hide_index()
    .set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'})
    .set_properties(subset=['Train Set'], **{'background-color': 'SeaGreen', 'color': 'White'})
)
display(
    Reports_Test.style.hide_index()
    .set_properties(**{'background-color': 'Azure', 'color': 'Black'})
    .set_properties(subset=['Test Set'], **{'background-color': 'RoyalBlue', 'color': 'White'})
)

PD = {'FS': (12, 6), 'annot_kws': 14, 'shrink': .6, 'Labels': list(Labels_dict.values())}
Confusion_Mat(CM_Train, CM_Test, PD=PD, n_splits=None)
| Train Set | precision | recall | f1-score | support |
|---|---|---|---|---|
| Absent | 1.000000 | 1.000000 | 1.000000 | 105.000000 |
| Present | 1.000000 | 1.000000 | 1.000000 | 84.000000 |
| accuracy | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| macro avg | 1.000000 | 1.000000 | 1.000000 | 189.000000 |
| weighted avg | 1.000000 | 1.000000 | 1.000000 | 189.000000 |
| Test Set | precision | recall | f1-score | support |
|---|---|---|---|---|
| Absent | 0.902439 | 0.822222 | 0.860465 | 45.000000 |
| Present | 0.800000 | 0.888889 | 0.842105 | 36.000000 |
| accuracy | 0.851852 | 0.851852 | 0.851852 | 0.851852 |
| macro avg | 0.851220 | 0.855556 | 0.851285 | 81.000000 |
| weighted avg | 0.856911 | 0.851852 | 0.852305 | 81.000000 |
tf.keras.backend.clear_session()
IT = 5000  # upper bound on the number of training steps

# Boosted-trees classifier with an L2 regularisation term.
classifier = tf.estimator.BoostedTreesClassifier(
    feature_columns=my_feature_columns,
    n_batches_per_layer=1,
    n_classes=len(Labels_dict),
    n_trees=120,
    max_depth=5,
    learning_rate=0.1,
    l2_regularization=1e-3,
)

# Fit on the training split, then score on the held-out split.
classifier.train(
    input_fn=lambda: input_fn(X_train, y_train, training=True),
    max_steps=IT,
)
result = classifier.evaluate(
    input_fn=lambda: input_fn(X_test, y_test, training=False),
)

# Drop the training log output and show the evaluation metrics as one row.
clear_output()
display(pd.DataFrame(result, index=['']).round(4))
| accuracy | accuracy_baseline | auc | auc_precision_recall | average_loss | label/mean | loss | precision | prediction/mean | recall | global_step | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.8765 | 0.5556 | 0.879 | 0.8448 | 0.691 | 0.4444 | 0.691 | 0.8421 | 0.474 | 0.8889 | 600 |
# ROC curves from the class probabilities of the test split.
pred_dicts = list(
    classifier.predict(input_fn=lambda: input_fn(X_test, y_test, training=False))
)
clear_output()
probs = np.array([entry['probabilities'] for entry in pred_dicts])
ROC_Curve(y_test, probs, n_classes=len(Labels_dict), FS=8)

# Train set: predicted class = index of the largest probability.
pred_dicts = list(
    classifier.predict(input_fn=lambda: input_fn(X_train, y_train, training=False))
)
clear_output()
probs = np.array([entry['probabilities'] for entry in pred_dicts])
y_pred = probs.argmax(axis=1).reshape(-1, 1)
CM_Train = metrics.confusion_matrix(y_train, y_pred)
Reports_Train = pd.DataFrame(
    metrics.classification_report(y_train, y_pred,
                                  target_names=list(Labels_dict.values()),
                                  output_dict=True)
).T.reset_index().rename(columns={'index': 'Train Set'})

# Test set: same steps on the held-out split.
pred_dicts = list(
    classifier.predict(input_fn=lambda: input_fn(X_test, y_test, training=False))
)
clear_output()
probs = np.array([entry['probabilities'] for entry in pred_dicts])
y_pred = probs.argmax(axis=1).reshape(-1, 1)
CM_Test = metrics.confusion_matrix(y_test, y_pred)
Reports_Test = pd.DataFrame(
    metrics.classification_report(y_test, y_pred,
                                  target_names=list(Labels_dict.values()),
                                  output_dict=True)
).T.reset_index().rename(columns={'index': 'Test Set'})

# Both reports are displayed only after the last clear_output() call, so
# neither table is wiped.
display(
    Reports_Train.style.hide_index()
    .set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'})
    .set_properties(subset=['Train Set'], **{'background-color': 'SeaGreen', 'color': 'White'})
)
display(
    Reports_Test.style.hide_index()
    .set_properties(**{'background-color': 'Azure', 'color': 'Black'})
    .set_properties(subset=['Test Set'], **{'background-color': 'RoyalBlue', 'color': 'White'})
)

PD = {'FS': (12, 6), 'annot_kws': 14, 'shrink': .6, 'Labels': list(Labels_dict.values())}
Confusion_Mat(CM_Train, CM_Test, PD=PD, n_splits=None)
| Train Set | precision | recall | f1-score | support |
|---|---|---|---|---|
| Absent | 1.000000 | 1.000000 | 1.000000 | 105.000000 |
| Present | 1.000000 | 1.000000 | 1.000000 | 84.000000 |
| accuracy | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| macro avg | 1.000000 | 1.000000 | 1.000000 | 189.000000 |
| weighted avg | 1.000000 | 1.000000 | 1.000000 | 189.000000 |
| Test Set | precision | recall | f1-score | support |
|---|---|---|---|---|
| Absent | 0.906977 | 0.866667 | 0.886364 | 45.000000 |
| Present | 0.842105 | 0.888889 | 0.864865 | 36.000000 |
| accuracy | 0.876543 | 0.876543 | 0.876543 | 0.876543 |
| macro avg | 0.874541 | 0.877778 | 0.875614 | 81.000000 |
| weighted avg | 0.878145 | 0.876543 | 0.876809 | 81.000000 |